home *** CD-ROM | disk | FTP | other *** search
- /* file "merge_indices.c" ... by ^z -- 870823-...
- * function to merge sorted indices together repeatedly until finished
- * with them all in a single set of *.k/*.p files ...
- *
- * The merging strategy is straightforward enough:
- * Let "g" denote the generation_number and "f" denote the file_number.
- * Temporary file names begin with the letter x, which is then followed
- * by a generation number (decimal), the letter k or p (standing for
- * key or ptr, respectively), and then the file number (decimal). Thus,
- * the file "x0k0" is the keys file #0 for generation #0 (the starting,
- * pre-merging, generation), file "x2p3" is the ptr file #3 for generation
- * #2, etc.
- *
- * (The following discussion is specifically for a 2-way merge ... but
- * the generalization for N-way merging is straightforward.)
- *
- * On a given call to merge_indices, the following may happen:
- * - files xgkf/xgpf and xgk(f+1)/xgp(f+1) are merged into files
- * x(g+1)k(f/2)/x(g+1)p(f/2), and then the parent files are
- * deleted;
- * - file xgkf isn't found, which means we are done with this
- * generation and must go on to the next;
- * - file xgk(f+1) isn't found, which means that either we are
- * completely done with the merging work (if f=0) and just
- * have to rename the files xgkf/xgpf into the correct final
- * names (that is, doc_filename.k/doc_filename.p), or else
- * (if f>0) we have an odd number of files for this level
- * of merging, and therefore just have to rename xgkf/xgpf
- * to x(g+1)k(f/2)/x(g+1)p(f/2).
- */
-
-
- #include <stdio.h>
- #include <unix.h>
- #include <storage.h>
- #include <strings.h>
- #include <ctype.h>
- #include <proto.h>
- #include "qndxr.2.h"
-
- int merge_indices (doc_filename)
- char *doc_filename;
- {
- static int generation_number = 0, file_number = 0;
- FILE *ink[NMERGE], *inp[NMERGE], *outk, *outp, *open_inkfile(),
- *open_inpfile(), *open_outkfile(), *open_outpfile();
- void fix_oddball_file_name(), fix_final_file_names(), merge_kpfiles(),
- remove_used_infiles(), close_infiles();
- long inwords, indistinctwords, outdistinctwords, file_size();
- int i, n;
-
- for (n = 0; n < NMERGE; ++n)
- {
- ink[n] = open_inkfile (generation_number, file_number + n);
- if (ink[n] == NULL)
- break;
- inp[n] = open_inpfile (generation_number, file_number + n);
- }
-
- if (file_number + n == 1)
- {
- DEBUG ("--All finished with merging files!\n", NULL);
- printf ("\nFinished with index sorting! Final file sizes:\n");
- printf ("%9ld bytes of distinct key words\n", file_size (ink[0]));
- printf ("%9ld bytes of pointers to words\n", file_size (inp[0]));
- close_infiles (ink, inp, n);
- fix_final_file_names (generation_number, doc_filename);
- return (FALSE);
- }
-
- if (n < 2)
- {
- DEBUG ("--finished generation_number=%d ", generation_number);
- DEBUG ("-- file_number=%d\n", file_number);
- if (n == 1)
- {
- close_infiles (ink, inp, n);
- fix_oddball_file_name (generation_number, file_number);
- }
- ++generation_number;
- file_number = 0;
- return (TRUE);
- }
-
- outk = open_outkfile (generation_number, file_number);
- outp = open_outpfile (generation_number, file_number);
-
- inwords = 0;
- indistinctwords = 0;
- for (i = 0; i < n; ++i)
- {
- inwords += file_size (inp[i]) / sizeof(long);
- indistinctwords += file_size (ink[i]) / sizeof(KEY_REC);
- }
- printf ("Merge pass #%d.%d\n", generation_number, file_number / NMERGE);
- printf ("Input files contain %ld words -- %ld distinct words...\n",
- inwords, indistinctwords);
-
- nway_merge_kpfiles (ink, inp, outk, outp, n);
-
- outdistinctwords = file_size (outk) / sizeof(KEY_REC);
- printf (" ... merged result has %ld distinct words.\n",
- outdistinctwords);
-
- close_infiles (ink, inp, n);
- fclose (outk);
- fclose (outp);
- remove_used_infiles (generation_number, file_number, n);
-
- file_number += NMERGE;
-
- return (TRUE);
- }
-
-